#!/usr/bin/env python 
#coding: utf-8

############################
__author__ = "kauu (kauu@yahoo.cn)"
__version__ = "1.0"
__copyright__ = "Copyright (c) 2008 "
__license__ = "NO"



import re,urllib2, time
from BeautifulSoup import BeautifulSoup
import logging as log
import md5,os
from datetime import datetime

from crawldb import DB_crawl as DB

def rss(url,db = None):
	if not url : 
		log.warn('no rss url path ...')	
		return 1
	if not db :
		log.error('no db connection !!')
		return 1
	import feedparser
	rss = feedparser.parse(url)
	for item in rss['entries']:
		title =  item['title']
		summary =  item['summary']
		publish =  item['updated_parsed']
		url = item['link']
		md = md5.new(url).hexdigest()
		content = parse_content(url)
		print len(content)
		record={'title':title.encode('utf-8'),
			'content':content,
			'summary':summary.encode('utf-8'),
			'publish':datetime.now(),
			'init_url':url,
			'md5':md}
		#print record['content']
		try:
			db.insert('article',record)
		except Exception,string:
			print string
		else:
			print 'insert db OK!!!'
		

def parse_content(url):
	"""Download url and return the inner HTML of div#sohu_content.

	Returns '' when the page has no such div (previously this raised
	TypeError because BeautifulSoup.find returns None on a miss and the
	code iterated over it unconditionally).
	"""
	html = crawl(url)
	soup = BeautifulSoup(html)
	content = soup.find('div',{'id':'sohu_content'})
	if content is None:
		# page layout changed or not a sohu article page
		return ''
	# join at C speed instead of quadratic '+=' accumulation
	return ''.join(str(cc) for cc in content)



def crawl(url):
	"""GET url with a browser-like User-Agent and return the raw body.

	Raises urllib2.URLError / HTTPError on network failure.
	"""
	# BUG FIX: the header VALUE must not repeat the 'User-Agent:' header
	# name, and the original concatenation was missing the space before
	# 'Gecko/...' (yielding '...rv:1.8.1.4)Gecko/...')
	user_agent = ('Mozilla/5.0 '
			'(Windows; U; Windows NT 5.1: en-GB;rv:1.8.1.4) '
			'Gecko/20070515 Firefox/2.0.0.4')

	request = urllib2.Request(url)
	request.add_header('User-Agent',user_agent)
	# renamed from `socket` to avoid shadowing the stdlib module name;
	# try/finally ensures the connection is closed even if read() raises
	conn = urllib2.urlopen(request)
	try:
		return conn.read()
	finally:
		conn.close()
		
# sample feed used when the module is run directly (Sohu auto news)
test_url ='http://rss.auto.sohu.com/rss/qichexinwen.xml' 


if __name__=='__main__':
	# crawl the sample feed and store its articles via the crawl DB
	db = DB()
	rss(test_url,db)
